import numpy as np
import pandas as pd
df=pd.read_csv('Diwali Sales Data.csv',encoding='latin1')
df.head()
| User_ID | Cust_name | Product_ID | Gender | Age Group | Age | Marital_Status | State | Zone | Occupation | Product_Category | Orders | Amount | Status | unnamed1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1002903 | Sanskriti | P00125942 | F | 26-35 | 28 | 0 | Maharashtra | Western | Healthcare | Auto | 1 | 23952.0 | NaN | NaN |
| 1 | 1000732 | Kartik | P00110942 | F | 26-35 | 35 | 1 | Andhra Pradesh | Southern | Govt | Auto | 3 | 23934.0 | NaN | NaN |
| 2 | 1001990 | Bindu | P00118542 | F | 26-35 | 35 | 1 | Uttar Pradesh | Central | Automobile | Auto | 3 | 23924.0 | NaN | NaN |
| 3 | 1001425 | Sudevi | P00237842 | M | 0-17 | 16 | 0 | Karnataka | Southern | Construction | Auto | 2 | 23912.0 | NaN | NaN |
| 4 | 1000588 | Joni | P00057942 | M | 26-35 | 28 | 1 | Gujarat | Western | Food Processing | Auto | 2 | 23877.0 | NaN | NaN |
df.shape
(11251, 15)
!pip install -U ydata-profiling
Requirement already satisfied: ydata-profiling in c:\users\91901\documents\anaconda1\lib\site-packages (4.6.1) Requirement already satisfied: imagehash==4.3.1 in c:\users\91901\documents\anaconda1\lib\site-packages (from ydata-profiling) (4.3.1) Requirement already satisfied: tqdm<5,>=4.48.2 in c:\users\91901\appdata\roaming\python\python310\site-packages (from ydata-profiling) (4.64.1) Requirement already satisfied: phik<0.13,>=0.11.1 in c:\users\91901\documents\anaconda1\lib\site-packages (from ydata-profiling) (0.12.3) Requirement already satisfied: wordcloud>=1.9.1 in c:\users\91901\documents\anaconda1\lib\site-packages (from ydata-profiling) (1.9.2) Requirement already satisfied: PyYAML<6.1,>=5.0.0 in c:\users\91901\documents\anaconda1\lib\site-packages (from ydata-profiling) (6.0) Requirement already satisfied: visions[type_image_path]==0.7.5 in c:\users\91901\documents\anaconda1\lib\site-packages (from ydata-profiling) (0.7.5) Requirement already satisfied: statsmodels<1,>=0.13.2 in c:\users\91901\documents\anaconda1\lib\site-packages (from ydata-profiling) (0.13.5) Requirement already satisfied: dacite>=1.8 in c:\users\91901\documents\anaconda1\lib\site-packages (from ydata-profiling) (1.8.1) Requirement already satisfied: htmlmin==0.1.12 in c:\users\91901\documents\anaconda1\lib\site-packages (from ydata-profiling) (0.1.12) Requirement already satisfied: pydantic>=2 in c:\users\91901\documents\anaconda1\lib\site-packages (from ydata-profiling) (2.4.2) Requirement already satisfied: seaborn<0.13,>=0.10.1 in c:\users\91901\documents\anaconda1\lib\site-packages (from ydata-profiling) (0.12.2) Requirement already satisfied: pandas!=1.4.0,<2.1,>1.1 in c:\users\91901\appdata\roaming\python\python310\site-packages (from ydata-profiling) (1.5.3) Requirement already satisfied: numpy<1.26,>=1.16.0 in c:\users\91901\documents\anaconda1\lib\site-packages (from ydata-profiling) (1.23.5) Requirement already satisfied: jinja2<3.2,>=2.11.1 in c:\users\91901\appdata\roaming\python\python310\site-packages (from ydata-profiling) (3.1.2) Requirement already satisfied: typeguard<5,>=4.1.2 in c:\users\91901\documents\anaconda1\lib\site-packages (from ydata-profiling) (4.1.5) Requirement already satisfied: numba<0.59.0,>=0.56.0 in c:\users\91901\documents\anaconda1\lib\site-packages (from ydata-profiling) (0.56.4) Requirement already satisfied: matplotlib<=3.7.3,>=3.2 in c:\users\91901\appdata\roaming\python\python310\site-packages (from ydata-profiling) (3.6.3) Requirement already satisfied: multimethod<2,>=1.4 in c:\users\91901\documents\anaconda1\lib\site-packages (from ydata-profiling) (1.10) Requirement already satisfied: scipy<1.12,>=1.4.1 in c:\users\91901\appdata\roaming\python\python310\site-packages (from ydata-profiling) (1.10.0) Requirement already satisfied: requests<3,>=2.24.0 in c:\users\91901\appdata\roaming\python\python310\site-packages (from ydata-profiling) (2.28.2) Requirement already satisfied: PyWavelets in c:\users\91901\documents\anaconda1\lib\site-packages (from imagehash==4.3.1->ydata-profiling) (1.4.1) Requirement already satisfied: pillow in c:\users\91901\appdata\roaming\python\python310\site-packages (from imagehash==4.3.1->ydata-profiling) (9.4.0) Requirement already satisfied: attrs>=19.3.0 in c:\users\91901\documents\anaconda1\lib\site-packages (from visions[type_image_path]==0.7.5->ydata-profiling) (22.1.0) Requirement already satisfied: networkx>=2.4 in c:\users\91901\documents\anaconda1\lib\site-packages (from visions[type_image_path]==0.7.5->ydata-profiling) (2.8.4) Requirement already satisfied: tangled-up-in-unicode>=0.0.4 in c:\users\91901\documents\anaconda1\lib\site-packages (from visions[type_image_path]==0.7.5->ydata-profiling) (0.2.0) Requirement already satisfied: MarkupSafe>=2.0 in c:\users\91901\appdata\roaming\python\python310\site-packages (from jinja2<3.2,>=2.11.1->ydata-profiling) (2.1.2) Requirement already satisfied: cycler>=0.10 in c:\users\91901\appdata\roaming\python\python310\site-packages (from matplotlib<=3.7.3,>=3.2->ydata-profiling) (0.11.0) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\91901\appdata\roaming\python\python310\site-packages (from matplotlib<=3.7.3,>=3.2->ydata-profiling) (1.4.4) Requirement already satisfied: python-dateutil>=2.7 in c:\users\91901\appdata\roaming\python\python310\site-packages (from matplotlib<=3.7.3,>=3.2->ydata-profiling) (2.8.2) Requirement already satisfied: pyparsing>=2.2.1 in c:\users\91901\appdata\roaming\python\python310\site-packages (from matplotlib<=3.7.3,>=3.2->ydata-profiling) (3.0.9) Requirement already satisfied: packaging>=20.0 in c:\users\91901\appdata\roaming\python\python310\site-packages (from matplotlib<=3.7.3,>=3.2->ydata-profiling) (23.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\91901\appdata\roaming\python\python310\site-packages (from matplotlib<=3.7.3,>=3.2->ydata-profiling) (4.38.0) Requirement already satisfied: contourpy>=1.0.1 in c:\users\91901\appdata\roaming\python\python310\site-packages (from matplotlib<=3.7.3,>=3.2->ydata-profiling) (1.0.7) Requirement already satisfied: llvmlite<0.40,>=0.39.0dev0 in c:\users\91901\documents\anaconda1\lib\site-packages (from numba<0.59.0,>=0.56.0->ydata-profiling) (0.39.1) Requirement already satisfied: setuptools in c:\users\91901\documents\anaconda1\lib\site-packages (from numba<0.59.0,>=0.56.0->ydata-profiling) (65.6.3) Requirement already satisfied: pytz>=2020.1 in c:\users\91901\appdata\roaming\python\python310\site-packages (from pandas!=1.4.0,<2.1,>1.1->ydata-profiling) (2022.7.1) Requirement already satisfied: joblib>=0.14.1 in c:\users\91901\appdata\roaming\python\python310\site-packages (from phik<0.13,>=0.11.1->ydata-profiling) (1.2.0) Requirement already satisfied: annotated-types>=0.4.0 in c:\users\91901\documents\anaconda1\lib\site-packages (from pydantic>=2->ydata-profiling) (0.6.0) Requirement already satisfied: typing-extensions>=4.6.1 in c:\users\91901\documents\anaconda1\lib\site-packages (from pydantic>=2->ydata-profiling) (4.8.0) Requirement already satisfied: pydantic-core==2.10.1 in c:\users\91901\documents\anaconda1\lib\site-packages (from pydantic>=2->ydata-profiling) (2.10.1) Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\users\91901\appdata\roaming\python\python310\site-packages (from requests<3,>=2.24.0->ydata-profiling) (1.26.14) Requirement already satisfied: idna<4,>=2.5 in c:\users\91901\appdata\roaming\python\python310\site-packages (from requests<3,>=2.24.0->ydata-profiling) (3.4) Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\91901\appdata\roaming\python\python310\site-packages (from requests<3,>=2.24.0->ydata-profiling) (3.0.1) Requirement already satisfied: certifi>=2017.4.17 in c:\users\91901\appdata\roaming\python\python310\site-packages (from requests<3,>=2.24.0->ydata-profiling) (2022.12.7) Requirement already satisfied: patsy>=0.5.2 in c:\users\91901\documents\anaconda1\lib\site-packages (from statsmodels<1,>=0.13.2->ydata-profiling) (0.5.3) Requirement already satisfied: colorama in c:\users\91901\appdata\roaming\python\python310\site-packages (from tqdm<5,>=4.48.2->ydata-profiling) (0.4.6) Requirement already satisfied: six in c:\users\91901\appdata\roaming\python\python310\site-packages (from patsy>=0.5.2->statsmodels<1,>=0.13.2->ydata-profiling) (1.16.0)
from ydata_profiling import ProfileReport
profile = ProfileReport(df, title="DataSet Profile Report")
profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
profile.to_widgets()
Render widgets: 0%| | 0/1 [00:00<?, ?it/s]
VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…
profile.to_notebook_iframe()
orgin_df=df[['Gender','Age Group','Age','Marital_Status','State','Zone','Occupation','Orders','Amount']]
orgin_df.head()
| Gender | Age Group | Age | Marital_Status | State | Zone | Occupation | Orders | Amount | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | F | 26-35 | 28 | 0 | Maharashtra | Western | Healthcare | 1 | 23952.0 |
| 1 | F | 26-35 | 35 | 1 | Andhra Pradesh | Southern | Govt | 3 | 23934.0 |
| 2 | F | 26-35 | 35 | 1 | Uttar Pradesh | Central | Automobile | 3 | 23924.0 |
| 3 | M | 0-17 | 16 | 0 | Karnataka | Southern | Construction | 2 | 23912.0 |
| 4 | M | 26-35 | 28 | 1 | Gujarat | Western | Food Processing | 2 | 23877.0 |
orgin_df.shape
(11251, 9)
orgin_df.isnull().sum()
Gender 0 Age Group 0 Age 0 Marital_Status 0 State 0 Zone 0 Occupation 0 Orders 0 Amount 12 dtype: int64
orgin_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 11251 entries, 0 to 11250 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Gender 11251 non-null object 1 Age Group 11251 non-null object 2 Age 11251 non-null int64 3 Marital_Status 11251 non-null int64 4 State 11251 non-null object 5 Zone 11251 non-null object 6 Occupation 11251 non-null object 7 Orders 11251 non-null int64 8 Amount 11239 non-null float64 dtypes: float64(1), int64(3), object(5) memory usage: 791.2+ KB
amount_mean=orgin_df['Amount'].mean()
orgin_df['Amount']=orgin_df['Amount'].fillna(value=amount_mean)
C:\Users\91901\AppData\Local\Temp\ipykernel_16992\3564231867.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy orgin_df['Amount']=orgin_df['Amount'].fillna(value=amount_mean)
orgin_df['Amount'].isnull().sum()
0
X=orgin_df.drop(columns=['Amount'])
y=orgin_df['Amount']
X.head()
| Gender | Age Group | Age | Marital_Status | State | Zone | Occupation | Orders | |
|---|---|---|---|---|---|---|---|---|
| 0 | F | 26-35 | 28 | 0 | Maharashtra | Western | Healthcare | 1 |
| 1 | F | 26-35 | 35 | 1 | Andhra Pradesh | Southern | Govt | 3 |
| 2 | F | 26-35 | 35 | 1 | Uttar Pradesh | Central | Automobile | 3 |
| 3 | M | 0-17 | 16 | 0 | Karnataka | Southern | Construction | 2 |
| 4 | M | 26-35 | 28 | 1 | Gujarat | Western | Food Processing | 2 |
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42)
X_train.shape,X_test.shape,y_train.shape,y_test.shape
((8438, 8), (2813, 8), (8438,), (2813,))
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
oht=OneHotEncoder(handle_unknown='ignore',sparse_output=False)
std=StandardScaler()
X_train
| Gender | Age Group | Age | Marital_Status | State | Zone | Occupation | Orders | |
|---|---|---|---|---|---|---|---|---|
| 11070 | F | 26-35 | 30 | 0 | Madhya Pradesh | Central | Banking | 3 |
| 7621 | F | 55+ | 73 | 1 | Haryana | Northern | Healthcare | 4 |
| 10796 | F | 18-25 | 22 | 0 | Gujarat | Western | Construction | 1 |
| 10239 | M | 26-35 | 29 | 0 | Maharashtra | Western | Banking | 2 |
| 5652 | M | 46-50 | 46 | 0 | Karnataka | Southern | Banking | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5734 | F | 18-25 | 20 | 0 | Bihar | Eastern | Banking | 3 |
| 5191 | F | 46-50 | 47 | 0 | Uttarakhand | Central | Aviation | 1 |
| 5390 | F | 46-50 | 49 | 0 | Kerala | Southern | Construction | 1 |
| 860 | F | 18-25 | 19 | 1 | Bihar | Eastern | Aviation | 2 |
| 7270 | F | 36-45 | 39 | 1 | Karnataka | Southern | Construction | 3 |
8438 rows × 8 columns
X_test
| Gender | Age Group | Age | Marital_Status | State | Zone | Occupation | Orders | |
|---|---|---|---|---|---|---|---|---|
| 5749 | M | 26-35 | 30 | 0 | Haryana | Northern | Banking | 4 |
| 7335 | F | 26-35 | 31 | 0 | Uttar Pradesh | Central | Retail | 2 |
| 5932 | M | 26-35 | 28 | 1 | Uttarakhand | Central | Automobile | 3 |
| 3819 | F | 18-25 | 22 | 0 | Karnataka | Southern | Chemical | 2 |
| 6395 | F | 18-25 | 22 | 0 | Rajasthan | Northern | Food Processing | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9112 | M | 18-25 | 22 | 1 | Haryana | Northern | IT Sector | 3 |
| 2269 | M | 36-45 | 44 | 0 | Andhra Pradesh | Southern | Banking | 4 |
| 9030 | M | 46-50 | 46 | 0 | Bihar | Eastern | Hospitality | 1 |
| 9966 | F | 26-35 | 32 | 1 | Karnataka | Southern | Healthcare | 3 |
| 7528 | M | 36-45 | 42 | 1 | Maharashtra | Western | Banking | 1 |
2813 rows × 8 columns
X_train_scaled=oht.fit_transform(X_train)
X_test_scaled=oht.transform(X_test)
X_train_scaled
array([[1., 0., 0., ..., 0., 1., 0.],
[1., 0., 0., ..., 0., 0., 1.],
[1., 0., 0., ..., 0., 0., 0.],
...,
[1., 0., 0., ..., 0., 0., 0.],
[1., 0., 0., ..., 1., 0., 0.],
[1., 0., 0., ..., 0., 1., 0.]])
X_test_scaled
array([[0., 1., 0., ..., 0., 0., 1.],
[1., 0., 0., ..., 1., 0., 0.],
[0., 1., 0., ..., 0., 1., 0.],
...,
[0., 1., 0., ..., 0., 0., 0.],
[1., 0., 0., ..., 0., 1., 0.],
[0., 1., 0., ..., 0., 0., 0.]])
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
svr=SVR()
dtr=DecisionTreeRegressor()
knr=KNeighborsRegressor()
svr.fit(X_train_scaled,y_train)
SVR()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SVR()
dtr.fit(X_train_scaled,y_train)
DecisionTreeRegressor()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeRegressor()
knr.fit(X_train_scaled,y_train)
KNeighborsRegressor()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KNeighborsRegressor()
y_pred_svr=svr.predict(X_test_scaled)
y_pred_dtr=dtr.predict(X_test_scaled)
y_pred_knr=knr.predict(X_test_scaled)
y_pred_svr
array([8089.09506714, 8130.44530743, 8096.58316785, ..., 8099.84763544,
8119.37959829, 8103.41641902])
y_pred_dtr
array([ 7972., 11616., 2002., ..., 19515., 8535., 1681.])
y_pred_knr
array([ 7493.8, 11585.8, 8026. , ..., 10420.8, 11178.8, 7372. ])
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
print("Mean absolute error:- ",mean_absolute_error(y_test,y_pred_svr))
print("Mean squared error:- ",mean_squared_error(y_test,y_pred_svr))
print("Root Mean squared error:- ",np.sqrt(mean_absolute_error(y_test,y_pred_svr)))
Mean absolute error:- 4193.6740966798 Mean squared error:- 29459301.56008552 Root Mean squared error:- 64.75858318925607
print("Mean absolute error:- ",mean_absolute_error(y_test,y_pred_dtr))
print("Mean squared error:- ",mean_squared_error(y_test,y_pred_dtr))
print("Root Mean squared error:- ",np.sqrt(mean_absolute_error(y_test,y_pred_dtr)))
Mean absolute error:- 5654.507289388122 Mean squared error:- 51284745.25594983 Root Mean squared error:- 75.19645795772645
print("Mean absolute error:- ",mean_absolute_error(y_test,y_pred_knr))
print("Mean squared error:- ",mean_squared_error(y_test,y_pred_knr))
print("Root Mean squared error:- ",np.sqrt(mean_absolute_error(y_test,y_pred_knr)))
Mean absolute error:- 4519.1313660053975 Mean squared error:- 31316789.272688925 Root Mean squared error:- 67.224484869766
print("Score :- ",r2_score(y_test,y_pred_svr))
print("Score :- ",r2_score(y_test,y_pred_dtr))
print("Score :- ",r2_score(y_test,y_pred_knr))
Score :- -0.06779897129072521 Score :- -0.8588966922897798 Score :- -0.13512655082134173
test=pd.DataFrame([['M','46-50',50,0,'Uttar Pradesh','Central','Healthcare',3]],columns=['Gender','Age Group','Age','Marital_Status','State','Zone','Occupation','Orders'])
test
| Gender | Age Group | Age | Marital_Status | State | Zone | Occupation | Orders | |
|---|---|---|---|---|---|---|---|---|
| 0 | M | 46-50 | 50 | 0 | Uttar Pradesh | Central | Healthcare | 3 |
test_scaled=oht.transform(test)
svr.predict(test_scaled)
array([8115.10030671])
dtr.predict(test_scaled)
array([4536.])
knr.predict(test_scaled)
array([6332.2])